package com.kdtech.analyse.Bbs;
import com.kdtech.crawler.at.UrlArgumentTop;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.DoMainUtils;
import com.kdtech.utils.HtmlCleaner;
import com.kdtech.utils.NumberUtils;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.kdtech.crawler.CrawlHTML;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.analyse.AnalyseNews;
import com.kdtech.utils.RegexUtils;
import com.kdtech.utils.StringUtils;
/**
 * Parser for thread pages of the Dongfang ("Eastern") Forum, http://bbs.cnool.net/.
 *
 * <p>Tag carried over from the original header: 7991 (presumably an internal
 * site id; confirm against the site registry).
 *
 * @author KK
 */
public class CnoolBbsAnalyse implements AnalyseNews {

	/**
	 * Patterns describing thread detail pages. Dots are escaped and at least one
	 * digit is required so that only well-formed thread URLs are accepted.
	 */
	private static final String[] DETAIL_PAGE_PATTERNS = {
			"http://bbs\\.cnool\\.net/cthread-[0-9]+\\.html",
	};

	/** Prefix of the endpoint that reports view and reply counts for a thread. */
	private static final String TOPIC_VIEWS_URL =
			"http://bbs.cnool.net/util/thread/loadtopicviews.aspx?infloat=1&inajax=1&action=load&var=__loadtopicviews&_=";

	/**
	 * Tells whether the given URL points to a thread detail page.
	 *
	 * @param url the URL to test
	 * @return the result of matching {@code url} against the detail page patterns
	 */
	public boolean isDetailPage(String url) {
		return RegexUtils.matchAny(url, DETAIL_PAGE_PATTERNS);
	}

	/**
	 * Extracts title, author, date, content and view/reply counters of a thread.
	 *
	 * <p>Besides parsing the supplied HTML, this method issues one extra request
	 * to the statistics endpoint when a thread id can be read from the URL.
	 *
	 * @param urlMeta the fetched page (URL plus HTML)
	 * @return the populated record, or {@code null} when {@code urlMeta} is
	 *         {@code null} or carries no HTML
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		// Nothing to parse: the fetch failed or returned no body. Jsoup rejects a
		// null input, so bail out instead of letting it throw.
		if (urlMeta == null || urlMeta.getHtml() == null) {
			return null;
		}
		String url = urlMeta.getUrl();
		Document doc = Jsoup.parse(urlMeta.getHtml());

		// The page title has the form "<thread title> - <suffix>"; keep the first part.
		String title = StringUtils.substringBefore(doc.select("title").text(), " - ");

		String content = null;
		Long date = null;
		String author = null;
		// The first child div carrying an id inside #view-bd is taken as the opening post.
		Element firstPost = doc.select("div#view-bd > div[id]").first();
		if (firstPost != null) {
			date = DateUtils.matchDate(firstPost.select("div.cont-hd").text());
			author = firstPost.select("a.user-name").text();
			Elements body = firstPost.select("div.cont-bd");
			// Drop hidden fragments so they do not end up in the extracted content.
			body.select("div[style*=display: none]").remove();
			content = HtmlCleaner.getContentHtml(url, body);
		}

		Integer commentNum = null;
		Integer clickNum = null;
		String threadId = StringUtils.substringBetween(url, "cthread-", ".html");
		// Without a thread id the statistics request would be sent with "topicid=null".
		if (StringUtils.isNotBlank(threadId)) {
			String statsTxt = CrawlHTML.GetHtml(TOPIC_VIEWS_URL + System.currentTimeMillis() + "&topicid=" + threadId);
			// Sample response:
			// var __loadtopicviews={"success":true,"code":200,"data":{"TopicId":"104715669","TopicViews":"72654","Replies":"157"}}
			if (statsTxt != null) {
				clickNum = NumberUtils.matchNumber(StringUtils.substringAfter(statsTxt, "TopicViews"));
				commentNum = NumberUtils.matchNumber(StringUtils.substringAfter(statsTxt, "Replies"));
			}
		}

		NewsMeta news = new NewsMeta();
		news.setUrl(url);
		// Type code 4 is kept from the original; its meaning is defined elsewhere.
		news.setType(4);
		news.setAuthor(author);
		news.setTitle(StringUtils.trimSpace(title));
		news.setCommentNum(commentNum);
		news.setClickNum(clickNum);
		news.setContent(StringUtils.trimSpace(content));
		news.setDate(date);
		// The thread URL itself is used for later refreshes.
		news.setUpdateUrl(url);
		return news;
	}

	/**
	 * Re-fetches and re-parses the thread referenced by the record's update URL.
	 *
	 * @param meta a previously parsed record
	 * @return a freshly parsed record, or {@code null} when the update URL is
	 *         blank or the page yields no HTML
	 */
	public NewsMeta Update(NewsMeta meta) {
		String url = meta.getUpdateUrl();
		if (StringUtils.isNotBlank(url)) {
			// parserHtml returns null for a missing page, so no extra check is needed here.
			return parserHtml(CrawlHTML.responseToURL(url));
		}
		return null;
	}

}
